For this challenge, we have decided to explore several approaches in order to predict the top hits of 2019 :
We have considered various data sources for our research :
For this challenge, we considered 3 main open source libraries :
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import plotly.plotly as py
import plotly.graph_objs as go
import IPython.display as ipd
import requests
import lyricsgenius as genius
from glob import glob
import os.path as op
from nltk.corpus import stopwords
import re
import itertools
from wordcloud import WordCloud, STOPWORDS
from sklearn import model_selection
from sklearn.svm import LinearSVC
import pickle
from bs4 import BeautifulSoup
import seaborn as sns
from matplotlib import rc
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
import requests, json, logging
import pandas as pd
import base64
import six
from sklearn import linear_model
# Load the chart exports downloaded from the Spotify Top Charts:
# https://spotifycharts.com/regional
folder = "/Users/maelfabien/TelecomParisTech/INFMDI721/Hackathon/Total/"

# Keep regular files only (sub-directories are ignored).
onlyfiles = [
    entry
    for entry in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, entry))
]
print("Working with {0} csv files".format(len(onlyfiles)))

# Read every chart file into its own DataFrame, tag it with the country and
# week taken from fixed positions in the file name, then stack all of them.
data = []
for file in onlyfiles:
    # Skip the macOS Finder metadata file, which is not a chart export.
    if file == '.DS_Store':
        continue
    # The first line of each file is skipped before the header is parsed.
    df = pd.read_csv(folder + file, skiprows=1)
    # NOTE(review): the slice offsets presumably match the export naming
    # scheme (two-character country code, one-character week) - confirm.
    df['country'] = file[9:11]
    df['week'] = file[19:20]
    data.append(df)

data = pd.concat(data, axis=0)
data.head(10)
# Total streams per country over all collected weeks (4 weeks),
# ordered from the smallest to the largest total.
data_country = (
    data.groupby(['country'])[["Streams"]]
    .sum()
    .sort_values('Streams', ascending=True)
)

# Total streams per artist over all collected weeks (4 weeks),
# ordered from the largest to the smallest total.
data_artists = (
    data.groupby(['Artist'])[["Streams"]]
    .sum()
    .sort_values('Streams', ascending=False)
)
# Bar chart of the total number of streams per country; the stream count is
# also passed as the per-bar text.
trace0 = go.Bar(
    x=data_country.index,
    y=data_country['Streams'],
    text=data_country['Streams'],
    marker={
        'color': 'rgb(158,202,225)',
        'line': {
            'color': 'rgb(8,48,107)',
            'width': 1.5,
        },
    },
    opacity=0.6,
)

# NOTE: this rebinds `data` to the list of traces handed to the figure.
data = [trace0]
layout = go.Layout(title='Regions of the world that consume most music')
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='text-hover-bar')
Over the past month, the US market has consumed as much music streaming as Germany, Great Britain, France, Australia, the Netherlands, Canada and Japan combined.
#plt.bar(data_country.index, data_country['Streams'])
#plt.title('Number of monthly streams')
#plt.show()
# Bar chart of the 100 artists with the most streams.
#
# `data_artists` is sorted by descending stream count, so `head(100)` keeps the
# 100 most-streamed artists; they are then re-sorted ascending so the bars grow
# from left to right. The slice is computed once and shared by `x`, `y` and
# `text` so that the per-bar text always matches the bar it is attached to
# (previously `text` was built from the full, un-truncated artist table and
# therefore did not line up with the 100 plotted bars).
top_artists = data_artists.head(100).sort_values('Streams', ascending=True)

trace0 = go.Bar(
    x=top_artists.index,
    y=top_artists['Streams'],
    text=top_artists['Streams'],
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(
            color='rgb(8,48,107)',
            width=1.5,
        )
    ),
    opacity=0.6
)

# NOTE: this rebinds `data` to the list of traces handed to the figure.
data = [trace0]
layout = go.Layout(
    title='Sales by most popular artist this month in top 10 regions',
)
fig = go.Figure(data=data, layout=layout)
# NOTE(review): same `filename` as the country chart - presumably this
# overwrites that chart in the Plotly account; confirm whether intended.
py.iplot(fig, filename='text-hover-bar')
This past month, the most popular artists in the charts were :
# Create an IPython audio object for a WAV file located next to the notebook.
# NOTE(review): presumably this is the last expression of a notebook cell, so
# the object is rendered as an inline audio player - confirm.
ipd.Audio('ariana_grande_-_thank_u__next.wav') # load a local WAV file